# Module 3 Demo — Joins and Summaries
# Based on R4DS Chapters 12 (Tidy Data) and 13 (Relational Data)
# Hands-on practice with clinical data joining and reshaping

# Load required libraries
library(tidyverse)  # Includes dplyr, tidyr for joins and pivots
library(lubridate)  # Date manipulation

# ===============================
# Part 1: Tidy Data Principles (R4DS Ch. 12)
# ===============================

cat("=== TIDY DATA DEMONSTRATION ===\n")

# Sample vital signs entered row by row, one subject per row.
# Every test/visit combination lives in its own <TEST>_<VISIT> column,
# which is the WIDE (non-tidy) layout.
vital_signs_wide <- tribble(
  ~USUBJID,  ~SYSBP_BL, ~DIABP_BL, ~PULSE_BL, ~SYSBP_W4, ~DIABP_W4, ~PULSE_W4,
  "001-001", 120,       80,        72,        118,       78,        70,
  "001-002", 135,       85,        78,        130,       82,        75,
  "001-003", 115,       75,        68,        112,       72,        65,
  "001-004", 140,       90,        85,        135,       88,        82
)

cat("\nWide format (NOT TIDY):\n")
print(vital_signs_wide)

# Reshape to LONG (tidy) format: one row per subject, parameter and visit.
# Column names such as "SYSBP_BL" are split at the underscore into a
# parameter code and a visit code, then both codes are decoded to labels.
vital_signs_long <- vital_signs_wide %>%
  pivot_longer(
    cols = !USUBJID,
    names_to = c("PARAM", "VISIT"),
    names_sep = "_",
    values_to = "AVAL"
  ) %>%
  mutate(
    # Named-vector lookups: a code without an entry decodes to NA
    PARAM = unname(c(
      SYSBP = "Systolic Blood Pressure",
      DIABP = "Diastolic Blood Pressure",
      PULSE = "Pulse Rate"
    )[PARAM]),
    VISIT = unname(c(
      BL = "Baseline",
      W4 = "Week 4"
    )[VISIT])
  )

cat("\nLong format (TIDY - better for analysis):\n")
print(vital_signs_long)

# ===============================
# Part 2: Sample Clinical Domains (Relational Data)
# ===============================

cat("\n=== CLINICAL DOMAINS SETUP ===\n")

# Demographics (DM): one row per subject, keyed by USUBJID.
# This is the primary table that the other domains are joined against.
dm <- tribble(
  ~USUBJID,  ~AGE, ~SEX, ~ARMCD, ~ARM,        ~RFSTDTC,
  "001-001", 45,   "M",  "TRT",  "Treatment", "2024-01-15",
  "001-002", 67,   "F",  "PBO",  "Placebo",   "2024-01-16",
  "001-003", 52,   "M",  "TRT",  "Treatment", "2024-01-17",
  "001-004", 71,   "F",  "TRT",  "Treatment", "2024-01-18",
  "001-005", 34,   "M",  "PBO",  "Placebo",   "2024-01-19"
) %>%
  # Flag subjects aged 65 or older
  mutate(ELDERLY = if_else(AGE >= 65, "Y", "N"))

cat("\nDemographics (DM):\n")
print(dm)

# Adverse Events (AE): one row per event, so a subject may appear several
# times. Subject 001-006 has an event but no row in DM, and 001-003 /
# 001-005 have no events; the join examples rely on these mismatches.
ae <- tribble(
  ~USUBJID,  ~AESEQ, ~AEDECOD,    ~AESEV,     ~AESTDTC,
  "001-001", 1,      "HEADACHE",  "MILD",     "2024-01-20",
  "001-001", 2,      "NAUSEA",    "MODERATE", "2024-01-22",
  "001-002", 1,      "FATIGUE",   "MILD",     "2024-01-21",
  "001-004", 1,      "DIZZINESS", "SEVERE",   "2024-01-25",
  "001-006", 1,      "HEADACHE",  "MILD",     "2024-02-01"
)

cat("\nAdverse Events (AE):\n")
print(ae)

# Vital Signs (VS): already in long format, one row per subject and test.
# Only baseline readings (VISITNUM 1) for the first three subjects.
vs <- tribble(
  ~USUBJID,  ~VSTESTCD, ~VSSTRESN, ~VISITNUM, ~VISIT,
  "001-001", "SYSBP",   120,       1,         "Baseline",
  "001-001", "DIABP",   80,        1,         "Baseline",
  "001-001", "PULSE",   72,        1,         "Baseline",
  "001-002", "SYSBP",   135,       1,         "Baseline",
  "001-002", "DIABP",   85,        1,         "Baseline",
  "001-002", "PULSE",   78,        1,         "Baseline",
  "001-003", "SYSBP",   115,       1,         "Baseline",
  "001-003", "DIABP",   75,        1,         "Baseline",
  "001-003", "PULSE",   68,        1,         "Baseline"
)

cat("\nVital Signs (VS):\n")
print(vs)

# ===============================
# Part 3: Join Operations (R4DS Ch. 13)
# ===============================

cat("\n=== JOIN OPERATIONS DEMONSTRATION ===\n")

# All four joins match on the subject identifier USUBJID.

# 1. Left join: every AE row is kept and DM columns are appended.
#    Events whose subject is missing from DM get NA in those columns.
cat("\n1. LEFT JOIN: Add demographics to adverse events\n")
ae_with_demo <- left_join(ae, dm, by = "USUBJID")

cat("AE with demographics (note: 001-006 has NA for demo variables):\n")
print(ae_with_demo)

# 2. Inner join: only AE rows whose subject also exists in DM survive.
cat("\n2. INNER JOIN: Keep only AEs for subjects with demographics\n")
ae_complete <- inner_join(ae, dm, by = "USUBJID")

cat("Complete AE data (excludes 001-006):\n")
print(ae_complete)

# 3. Semi join: a filtering join. DM rows with at least one AE are kept,
#    each once, and no AE columns are added.
cat("\n3. SEMI JOIN: Find subjects who had any adverse events\n")
subjects_with_ae <- semi_join(dm, ae, by = "USUBJID")

cat("Subjects with AEs:\n")
print(subjects_with_ae)

# 4. Anti join: the complement of the semi join, i.e. DM rows with no AE.
cat("\n4. ANTI JOIN: Find subjects who had NO adverse events\n")
subjects_without_ae <- anti_join(dm, ae, by = "USUBJID")

cat("Subjects without AEs:\n")
print(subjects_without_ae)

# ===============================
# Part 4: Group By and Summarise
# ===============================

cat("\n=== GROUP BY AND SUMMARISE DEMONSTRATION ===\n")

# Subject and event counts per treatment arm.
# AE rows whose subject has no DM record carry NA in ARMCD after the left
# join, so dropping rows with a missing arm removes them.
ae_summary <- ae_with_demo %>%
  drop_na(ARMCD) %>%
  group_by(ARMCD) %>%
  summarise(
    n_subjects = length(unique(USUBJID)),  # distinct subjects with an AE
    n_events = length(USUBJID),            # one row per event
    .groups = "drop"
  )

cat("\nAE summary by treatment:\n")
print(ae_summary)

# Event and subject counts for each elderly-flag / severity combination,
# again leaving out events that could not be matched to DM.
elderly_ae_summary <- ae_with_demo %>%
  drop_na(ELDERLY) %>%
  group_by(ELDERLY, AESEV) %>%
  summarise(
    n_events = length(USUBJID),
    n_subjects = length(unique(USUBJID)),
    .groups = "drop"
  ) %>%
  arrange(ELDERLY, AESEV)

cat("\nAE summary by elderly status and severity:\n")
print(elderly_ae_summary)

# ===============================
# Part 5: Pivot Operations with Clinical Data
# ===============================

cat("\n=== PIVOT OPERATIONS WITH VITAL SIGNS ===\n")

# Long-format summary: record count and mean result (rounded to one
# decimal) for every treatment arm / vital-sign test combination.
# Only the arm code is pulled from DM, to keep the join narrow.
vs_summary <- vs %>%
  left_join(select(dm, USUBJID, ARMCD), by = "USUBJID") %>%
  group_by(ARMCD, VSTESTCD) %>%
  summarise(
    n = n(),
    mean_val = round(mean(VSSTRESN), digits = 1),
    .groups = "drop"
  )

cat("\nVital signs summary (long format):\n")
print(vs_summary)

# Report layout: one row per arm, with a pair of columns per test named
# <statistic>_<test code>, e.g. n_SYSBP and mean_val_SYSBP.
vs_wide_summary <- vs_summary %>%
  pivot_wider(
    names_from = VSTESTCD,
    values_from = c(n, mean_val),
    names_glue = "{.value}_{VSTESTCD}"
  )

cat("\nVital signs summary (wide format for reporting):\n")
print(vs_wide_summary)

# ===============================
# Part 6: Complex Clinical Analysis Pipeline
# ===============================

cat("\n=== COMPLEX ANALYSIS PIPELINE ===\n")

# End-to-end AE analysis combining the earlier concepts: attach demographics
# to each adverse event, derive the onset study day and a severe-event flag,
# then count subjects and events and average the onset day for every
# arm / elderly status / severity-flag combination.
clinical_analysis <- ae %>%
  # Join with demographics
  left_join(dm, by = "USUBJID") %>%
  # Keep only events whose subject has a DM record (ARMCD is NA otherwise)
  filter(!is.na(ARMCD)) %>%
  # Create analysis variables
  mutate(
    AESTDT = ymd(AESTDTC),
    RFSTDT = ymd(RFSTDTC),
    # Study day follows the CDISC --DY rule, which has no day 0: on or
    # after the reference start date it is (difference in days + 1), so the
    # reference date itself is day 1; before it, it is the plain negative
    # difference, so the day before the reference date is day -1.
    AEDY = as.numeric(AESTDT - RFSTDT),
    AEDY = if_else(AEDY >= 0, AEDY + 1, AEDY),
    SEVERE_AE = ifelse(AESEV == "SEVERE", "Y", "N")
  ) %>%
  # Group and summarise
  group_by(ARMCD, ELDERLY, SEVERE_AE) %>%
  summarise(
    n_subjects = n_distinct(USUBJID),
    n_events = n(),
    # na.rm: an unparseable date yields NA for AEDY and is left out of the mean
    mean_onset_day = round(mean(AEDY, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  # Arrange results
  arrange(ARMCD, ELDERLY, SEVERE_AE)

cat("\nComprehensive AE analysis:\n")
print(clinical_analysis)

# ===============================
# Module 3 Demo Complete!
# ===============================

# Closing banner: print the recap of what the demo covered in one call.
# sep = "" because every piece already ends with its own newline.
cat(
  "\n🎉 Module 3 Demo Complete!\n",
  "You've practiced:\n",
  "- Tidy data principles with pivot_longer() and pivot_wider()\n",
  "- All types of joins: left, inner, semi, anti\n",
  "- Group by and summarise operations\n",
  "- Complex clinical data analysis pipelines\n",
  "- Combining multiple domains (DM, AE, VS)\n",
  "\nReady for advanced clinical data analysis!\n",
  sep = ""
)
